Lab 6: Machine Learning in Hydrology

Using Tidymodels & CAMELS Data

Author

Samantha Nauman

Published

April 1, 2025

Lab Set Up, Data Download, Doc PDF

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.6     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.1
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
library(ggplot2)
root  <- 'https://gdex.ucar.edu/dataset/camels/file'
download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
              'data/camels_attributes_v2.0.pdf')

Getting Basin Characteristics

types <- c("clim", "geol", "soil", "topo", "vege", "hydro")

# Where the files live online ...
remote_files  <- glue('{root}/camels_{types}.txt')
# where we want to download the data ...
local_files   <- glue('data/camels_{types}.txt')

walk2(remote_files, local_files, download.file, quiet = TRUE)

# Read and merge data
camels <- map(local_files, read_delim, show_col_types = FALSE) 

camels <- power_full_join(camels ,by = 'gauge_id')

Question 1: Your Turn

# making a map of the sites
ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

  • “zero_q_freq” represents the frequency of days with Q = 0 mm/day, reported as a percentage, where Q is daily discharge.

Question 2: Aridity and P_mean Maps

aridity_map <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "black") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "aquamarine", high = "darkorchid") +
  labs(x = "Longitude", y = "Latitude", title = "Aridity Gradient Across the U.S.") +
  ggthemes::theme_map()

p_mean_map <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "black") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "bisque2", high = "darkolivegreen") +
  labs(x = "Longitude", y = "Latitude", title = "Mean Daily Precipitation Across the U.S.") +
  ggthemes::theme_map()

sites_map <- ggpubr::ggarrange(aridity_map, p_mean_map, ncol = 2)
print(sites_map)

ggplot2::ggsave(filename = "imgs/sites_map.png", plot = sites_map, width = 10, height = 6)

Model Preparation

camels |> 
  select(aridity, p_mean, q_mean) |> 
  drop_na() |> 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
# rainfall and mean flow have strong correlation, inverse correlation with aridity and rainfall 

Visual EDA

# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

# exponential decay, NOT linear... so

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

# this one shows a more linear log-log relationship betweeen aridity and rainfall

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

Model Building

Lets start by splitting the data

set.seed(123)
# Bad form to perform simple transformations on the outcome variable within a recipe. So, we'll do it here.
camels <- camels |> 
  mutate(logQmean = log(q_mean))

# Generate the split
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)

Preprocessor: Recipe

# Create a recipe to preprocess the data
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) %>%
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |> 
  # Drop any rows with missing values in the pred
  step_naomit(all_predictors(), all_outcomes())

Naive base lm approach:

# Prepare the data
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

# Interaction with lm
#  Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Check the recipe
# Sanity Interaction term from recipe ... these should be equal!!
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16

Correct way to evaluate the model on test data: prep -> bake -> predict

test_data <-  bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)

Model Evaluation: Statistical and Visual

metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Apply a gradient color scale
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

Using a workflow instead

# Define model
lm_model <- linear_reg() %>%
  # define the engine
  set_engine("lm") %>%
  # define the mode
  set_mode("regression")

# Instantiate a workflow ...
lm_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(lm_model) %>%
  # Fit the model to the training data
  fit(data = camels_train) 

# Extract the model coefficients from the workflow
summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
# From the base implementation
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01

Making Predictions

#
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61

Model Evaluation: Statistical and Visual

# extracting default metrics between the observed and predicted mean streamflow values
metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

Switch it up!

# random forest model to predict mean streamflow
library(baguette)
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train) 

# make predictions on the test data
rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60

Model Evaluation: Statistical and Visual

# create a scatter plot of the observed vs predicted values, colored by aridity
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.592
2 rsq     standard       0.736
3 mae     standard       0.367
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

A workflowset approach

# this compared multiple models by defining a set of workflows, fit them to the same data, and evaluate their performance using a common metric
wf <- workflow_set(list(rec), list(lm_model, rf_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     1
2 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0249    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.769  0.0261    10 recipe       rand…     2

Question 3: Your Turn Model Building

# build a xgboost regression model using boost_tree
xgb_mod <- boost_tree(mode = "regression",
                            trees = 1000) |>
  set_engine('xgboost')

# build a neural network model using the nnet engine fron the baguette package using the bag_mlp function
nn_mod <- bag_mlp() %>%
  set_mode("regression") %>%
  set_engine('nnet')

# add to the above workflow
xgb_workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(xgb_mod) %>%
  fit(data = camels_train) %>%
  augment(camels_train)

nn_workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(nn_mod) %>%
  fit(data = camels_train) %>%
  augment(camels_train) 

# evaluate the model and compare it to the linear and random forest models
metrics(xgb_workflow, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     0.00292
2 rsq     standard     1.00   
3 mae     standard     0.00211
metrics(nn_workflow, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.478
2 rsq     standard       0.837
3 mae     standard       0.299
ggplot(xgb_workflow, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  labs(title = "XgBoost Model") +
  theme_bw()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

ggplot(nn_workflow, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  labs(title = "Neural Network Model") +
  theme_linedraw()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

autoplot(wf)

  • Out of the linear regression, random forest, boosted tree, and neural network models, I would move forward with the xgboost model. This is because the results are right on the line of best fit (1:1), meaning the metrics have more significance and are less likely to have residual errors.

Question 4: Build your own

Data Splitting

# set a new seed for reproducible
set.seed(123456) # new sequence

# create initial 75% training and 25% testing split and extract the sets
camels_strata <- initial_split(camels, prop = .75)

train_camels <- training(camels_strata)
test_camels <- testing(camels_strata)

# build a 10-fold CV dataset
camels_folds <-
  vfold_cv(train_camels, v = 10)

Recipe

# define a formula you want to use
formula <- logQmean ~ p_mean + aridity + high_prec_dur

# build a recipe
train_camels <- na.omit(train_camels)
rec_camels <-  recipe(logQmean ~ p_mean + aridity + high_prec_dur, data = train_camels) %>%
  step_log(all_predictors()) %>%
  step_interact(terms = ~ aridity:p_mean) %>%
  step_naomit(all_predictors(), all_outcomes()) %>%
  step_zv(all_predictors()) 

# prep the data
baked_camels <- prep(rec_camels, train_camels) %>%
  bake(new_data = NULL)

# check the recipe (should be zero)
sum(is.na(baked_camels))
[1] 0
sum(is.infinite(as.matrix(baked_camels))) 
[1] 0
  • The formula was chose was based on the inclusion of the predictor variables that I believe influence mean daily discharge: p_mean, aridity, and logQmean. Precipitation adds water to the system, while aridity indicates how dry an area is (drier areas usually have lower logQmean). I also expect that more frequent heavy rain events (high_prec_dur) will lead to higher average discharge.

Define 3 Models

# define a random forest model
q4_rf_mod <- rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("regression")

# 2 other models of choice
q4_xgb_mod <- boost_tree() %>%
  set_engine("xgboost") %>%
  set_mode("regression") 

q4_lm_mod <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression") 

Workflow Set

# create workflow objects, add the recipe, add the models
q4_rf_wf <- workflow() %>%
  add_recipe(rec_camels) %>%
  add_model(q4_rf_mod)

q4_xgb_wf <- workflow() %>%
  add_recipe(rec_camels) %>%
  add_model(q4_xgb_mod)

q4_lm_wf <- workflow() %>%
  add_recipe(rec_camels) %>%
  add_model(q4_lm_mod) 

# fit the model to the resamples
rf_results <- fit_resamples(q4_rf_wf, resamples = camels_folds)
xgb_results <- fit_resamples(q4_xgb_wf, resamples = camels_folds) 
lm_results <- fit_resamples(q4_lm_wf, resamples = camels_folds) 

Evaluation

# use autoplot and rank_results to compare the models
q4_wf <- workflow_set(list(rec_camels), list(q4_rf_mod, q4_xgb_mod, q4_lm_mod)) %>%
  workflow_map('fit_resamples', resamples = camels_cv)

autoplot(q4_wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     1
2 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0249    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.769  0.0261    10 recipe       rand…     2
  • Out of the random forest, linear, and xgboost model, I think that the random forest model is the strongest performer because it shows the lowest RMSE and the highest RSQ compared to the others, capturing the relationship between predictors and response very well. These metrics indicate that the random forest’s predictions are close to the actual values and explain a large variance in the data. This means it consistently makes accurate predictions and captures the underlying patterns in the dataset better than the other models.

Extract and Evaluate

# build a workflow with favorite model, recipe, and training data
final_wf <- workflow() %>%
  add_recipe(rec_camels) %>%
  add_model(q4_rf_mod) %>%
  fit(data = train_camels) # use fit to fit all training data to the model

# use augment to make preditions on the test data
final_wf_data <- augment(final_wf, new_data = test_camels)

#create a plot of the observed vs predicted values
ggplot(final_wf_data, aes(x = .pred, y = logQmean, colour = logQmean)) +
  scale_color_gradient2(low = "blue3", mid = "yellow", high = "chartreuse") +
  geom_point() +
  geom_abline(linetype = 2) +
  labs(title = "Random Forest Model: Observed vs. Predicted",
       x = "Predicted Log Mean Flow",
       y = "Observed Log Mean Flow?")

These results suggest that the random forest model performs well on predicting logQmean based on the predictors chosen. Most of the points are along the 1:1 line, indicating that the model’s predicted values match the observed values closely, accurately capturing the relationship between the predictors and logQmean effectively.